Research Question: Can we identify tipping point neighborhoods where eviction filings will surge next month to help with preemptive rental assistance distribution?
Method: Negative Binomial regression with rolling-window cross-validation using monthly tract-level panel data. This could be an issue because the data are heavily zero-inflated.
Data Scope: January 2020 - November 2025 eviction filings across Philadelphia census tracts with weekly and monthly granularity.
Key Points:
Extreme zero-inflation (60-70% zeros) is problematic for a standard Negative Binomial model.
Strong seasonality with summer-fall peaks requiring month fixed effects.
Moratorium policy created structural break from March 2020 through September 2021.
High-frequency weekly data capture volatility, but monthly data are better suited for forecasting.
Setup and Initial Data Loading
Code
# Packages used throughout the analysis (load order kept as-is, since later
# packages can mask names from earlier ones).
library(tidyverse)
library(lubridate)
library(sf)
library(scales)
library(patchwork)
library(viridis)
library(kableExtra)
library(corrplot)
library(ggridges)
library(forecast)
library(zoo)
library(knitr)

# Print numbers in fixed rather than scientific notation.
options(scipen = 999)

# Apply one ggplot2 theme to every figure.
theme_set(theme_minimal(base_size = 12))

# Save figures as PNG files under eviction_figures/ when knitting.
opts_chunk$set(fig.path = "eviction_figures/", dev = "png")
Monthly Tract-Level Data
Code
# Read the monthly eviction filings (one row per census tract and month).
df_monthly_raw <- read.csv(
  file = "data/eviction/philadelphia_monthly_2020_2021.csv"
)

# Report the number of rows and columns that were read.
df_monthly_raw %>%
  dim() %>%
  print()
# Preview the first ten rows as a styled table to check the column layout.
kable_styling(
  kable(head(df_monthly_raw, 10)),
  bootstrap_options = c("striped", "hover", "condensed")
)
type
GEOID
racial_majority
month
filings_2020
filings_avg
filings_avg_prepandemic_baseline
last_updated
Census Tract
42101000101
White
01/2020
0
2.5
0.75
Census Tract
42101000101
White
02/2020
0
1.5
1.00
Census Tract
42101000101
White
03/2020
0
1.5
1.00
Census Tract
42101000101
White
04/2020
0
1.0
1.75
Census Tract
42101000101
White
05/2020
0
2.0
1.75
Census Tract
42101000101
White
06/2020
0
1.0
2.00
Census Tract
42101000101
White
07/2020
0
2.5
0.50
Census Tract
42101000101
White
08/2020
1
1.5
0.75
Census Tract
42101000101
White
09/2020
2
0.0
3.50
Census Tract
42101000101
White
10/2020
2
1.5
1.50
Weekly Tract-Level Data
Code
# Read the weekly eviction filings used for the high-frequency analysis.
df_weekly_raw <- read.csv(
  file = "data/eviction/philadelphia_weekly_2020_2021.csv"
)

# Report the number of rows and columns that were read.
df_weekly_raw %>%
  dim() %>%
  print()
# Raw-count panel: keep filings_count as-is, add its log10(count + 1)
# transform, and tag every row with the "Raw Count" facet label.
df_hist_monthly <- mutate(
  df_monthly,
  Log_Count = log10(filings_count + 1),
  Scale = factor(
    "Raw Count",
    levels = c("Raw Count", "Log10(Count + 1)")
  )
)

# Log panel: same rows, but the plotted column is overwritten with the
# log-transformed value and the facet label is switched.
df_log_monthly <- mutate(
  df_hist_monthly,
  filings_count = Log_Count,
  Scale = factor(
    "Log10(Count + 1)",
    levels = c("Raw Count", "Log10(Count + 1)")
  )
)
The high share of zeros suggests a zero-inflated Negative Binomial (ZINB) model, but I think we are restricted to a regular Negative Binomial model; let me know if I am wrong. A dispersion ratio > 1 justifies Negative Binomial over Poisson.
Code
# Stack the raw and log-transformed rows so each becomes one facet.
df_final_hist_monthly <- bind_rows(df_hist_monthly, df_log_monthly)

# Side-by-side histograms (raw vs. log10) of monthly filings per tract.
# Free scales let each panel use its own x and y range.
dist_plot_monthly <- ggplot(
  data = df_final_hist_monthly,
  mapping = aes(x = filings_count)
) +
  geom_histogram(
    bins = 50,
    fill = "#E74C3C",
    color = "white",
    alpha = 0.8
  ) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Distribution of Monthly Eviction Filings per Census Tract",
    # The zero percentage comes from zero_stats_monthly, computed earlier.
    subtitle = sprintf(
      "%.1f%% zeros justify Zero-Inflated Negative Binomial (ZINB) model.\nNot sure if in project toolbox.",
      zero_stats_monthly$Zero_Pct
    ),
    x = "Filings Count",
    y = "Frequency (Number of Observations)",
    caption = "Data: Philadelphia Eviction Filings 2020-2025 | Monthly tract-level observations"
  ) +
  facet_wrap(~ Scale, scales = "free", ncol = 2) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold")
  )

# Render the plot that illustrates the zero-inflation.
dist_plot_monthly
# Share of all filings that falls in the top-risk tracts.
# tract_totals and top_risk_tracts are built earlier in the document.
total_system_filings <- sum(tract_totals[["Total_Filings"]])
top_20_pct <-
  sum(top_risk_tracts[["Total_Filings"]]) / total_system_filings * 100

# Print the concentration figure as a one-line summary.
cat(
  sprintf(
    "Concentration: Top 20 tracts account for %.1f%% of all filings\n",
    top_20_pct
  )
)
Concentration: Top 20 tracts account for 17.3% of all filings
Pre-Pandemic Baseline Comparison
Weekly Filings vs Baseline Ratio
Code
# Sum filings and the pre-pandemic baseline over all tracts for each week,
# then express each week's filings as a multiple of that baseline.
weekly_baseline_ratio <- df_weekly %>%
  group_by(date) %>%
  summarise(
    Total_Filings = sum(filings_count, na.rm = TRUE),
    Total_Baseline = sum(filings_avg_prepandemic_baseline, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Drop weeks with a zero baseline so the ratio is always defined.
  filter(Total_Baseline > 0) %>%
  mutate(Ratio_to_Baseline = Total_Filings / Total_Baseline)

# Weekly ratio (thin line) with a LOESS trend (red), a dashed reference line
# at ratio = 1, and dotted markers at 2020-03-01 and 2021-09-30.
baseline_ratio_plot <- ggplot(
  data = weekly_baseline_ratio,
  mapping = aes(x = date, y = Ratio_to_Baseline)
) +
  geom_line(color = "#16A085", linewidth = 0.8, alpha = 0.6) +
  geom_smooth(
    method = "loess",
    se = TRUE,
    color = "#E74C3C",
    linewidth = 1.2,
    span = 0.15
  ) +
  geom_hline(
    yintercept = 1,
    linetype = "dashed",
    color = "black",
    linewidth = 1
  ) +
  # Label the reference line at the left edge of the series.
  annotate(
    "text",
    x = min(weekly_baseline_ratio$date),
    y = 1.05,
    label = "Pre-Pandemic Average",
    hjust = 0,
    fontface = "bold"
  ) +
  geom_vline(
    xintercept = as.Date("2020-03-01"),
    linetype = "dotted",
    color = "gray50",
    alpha = 0.7
  ) +
  geom_vline(
    xintercept = as.Date("2021-09-30"),
    linetype = "dotted",
    color = "gray50",
    alpha = 0.7
  ) +
  labs(
    title = "Weekly Filings Normalized to Pre-Pandemic Baseline",
    subtitle = "Ratio > 1 indicates system stress relative to historical norm | Smoothed trend in red",
    x = "Date (Week Ending)",
    y = "Weekly Filings / Pre-Pandemic Average Ratio",
    caption = "Data: Philadelphia Weekly Eviction Filings"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold", size = 14))

baseline_ratio_plot
Code
# Mean and median baseline ratio for each policy period. The moratorium
# window is inclusive of both 2020-03-01 and 2021-09-30.
weekly_baseline_ratio %>%
  mutate(
    Period = case_when(
      date < as.Date("2020-03-01") ~ "Pre-Moratorium",
      date <= as.Date("2021-09-30") ~ "Moratorium",
      date > as.Date("2021-09-30") ~ "Post-Moratorium"
    )
  ) %>%
  group_by(Period) %>%
  summarise(
    Mean_Ratio = mean(Ratio_to_Baseline, na.rm = TRUE),
    Median_Ratio = median(Ratio_to_Baseline, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  kable(digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Reading layer `pa_tracts' from data source
`C:\Users\Tess\Desktop\UPenn\UPenn_FW25\MUSA_5080-401_Public_Policy_Analytics\shark-tank\data\pa_tracts\pa_tracts.shp'
using driver `ESRI Shapefile'
Simple feature collection with 3446 features and 12 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -80.51985 ymin: 39.7198 xmax: -74.68956 ymax: 42.51607
Geodetic CRS: NAD83
Code
# Total filings per tract over every month in df_monthly.
monthly_totals_by_tract <- summarise(
  group_by(df_monthly, GEOID),
  Total_Filings = sum(filings_count, na.rm = TRUE),
  .groups = "drop"
)

# GEOID is converted to character so it can be matched against the
# tract geometries' GEOID key.
df_monthly_agg <- mutate(monthly_totals_by_tract, GEOID = as.character(GEOID))

# Attach the totals to the tract geometries; tracts without filings data
# keep their geometry and get NA for Total_Filings.
tract_map_monthly <- left_join(tract_geo, df_monthly_agg, by = "GEOID")
Code
# Mean weekly filings per tract over every week in df_weekly.
weekly_means_by_tract <- summarise(
  group_by(df_weekly, GEOID),
  Avg_Weekly_Filings = mean(filings_count, na.rm = TRUE),
  .groups = "drop"
)

# GEOID is converted to character so it can be matched against the
# tract geometries' GEOID key.
df_weekly_agg <- mutate(weekly_means_by_tract, GEOID = as.character(GEOID))

# Attach the weekly averages to the tract geometries for the short-term
# risk map; unmatched tracts get NA for Avg_Weekly_Filings.
tract_map_weekly <- left_join(tract_geo, df_weekly_agg, by = "GEOID")
Choropleths
Code
# Fill scale reused by both maps: log10-transformed viridis palette with the
# direction reversed, gray for tracts with no data, and an explicit legend
# title so both maps are labelled the same way.
common_fill_scale <- scale_fill_viridis_c(
  trans = "log10",
  labels = comma,
  na.value = "gray70",
  direction = -1,
  name = "Count (Log)"
)

# Left map: total filings per tract over the whole study period (overall,
# long-term risk). 1 is added before plotting so that tracts with zero
# filings remain defined on the log10 scale.
# The title's year range matches the 2020-2025 data scope stated for this
# analysis (it previously read 2020-2023).
map_total_risk <- ggplot(tract_map_monthly) +
  geom_sf(aes(fill = Total_Filings + 1), color = "white", linewidth = 0.1) +
  common_fill_scale +
  labs(
    title = "Total Eviction Filings (2020-2025)",
    subtitle = "Long-term risk and structural disparity."
  ) +
  coord_sf(datum = NA) +
  theme_void()

# Right map: average weekly filings per tract (volatility / short-term
# risk), with the same +1 offset for the log10 scale.
map_weekly_avg <- ggplot(tract_map_weekly) +
  geom_sf(
    aes(fill = Avg_Weekly_Filings + 1),
    color = "white",
    linewidth = 0.1
  ) +
  common_fill_scale +
  labs(
    title = "Average Weekly Filings",
    subtitle = "Normalized view of short-term filing activity."
  ) +
  coord_sf(datum = NA) +
  theme_void()

# Show total filings and average weekly filings side by side, with the
# legend placed horizontally beneath the maps.
# NOTE(review): the two maps cover different value ranges, so the collected
# guides may not merge into a single legend - confirm in the rendered figure.
(map_total_risk | map_weekly_avg) +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom", legend.direction = "horizontal")